package com.kdtech.analyse.NewsPaper;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;

/**
 * URL classifier and article parser for the e-paper pages served from
 * {@code www.gxrb.com.cn}, {@code www.ngzb.com.cn},
 * {@code gxrb.gxnews.com.cn}, {@code ngzb.gxnews.com.cn} and
 * {@code epaper.gxnews.com.cn/ngjb}.
 *
 * <p>URL recognition is a full-string match against precompiled patterns; only
 * the {@code http} scheme and the {@code .htm} extension are accepted.
 */
public class GxrbNewsPaperAnalyse implements AnalyseNews {

	/**
	 * Scheme and host alternatives shared by every recognised URL. The epaper
	 * host additionally carries the fixed {@code /ngjb} path segment. Dots are
	 * escaped so that they match a literal '.' only.
	 */
	private static final String SITE_PREFIX =
			"http://(?:www\\.(?:gxrb|ngzb)\\.com\\.cn"
			+ "|(?:gxrb|ngzb)\\.gxnews\\.com\\.cn"
			+ "|epaper\\.gxnews\\.com\\.cn/ngjb)";

	/** Issue-date directory, e.g. {@code /html/2012-12/04/}. */
	private static final String DATE_PATH = "/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/";

	/** Full-match pattern for article ("content_N.htm") pages. */
	private static final Pattern DETAIL_PAGE =
			Pattern.compile(SITE_PREFIX + DATE_PATH + "content_[0-9]*\\.htm");

	/** Full-match pattern for section listing ("node_N.htm") pages. */
	private static final Pattern TASK_PAGE =
			Pattern.compile(SITE_PREFIX + DATE_PATH + "node_[0-9]*\\.htm");

	/** First-choice headline selector: layout whose inner cell has class {@code px12c}. */
	private static final String TITLE_SELECTOR_PX12C =
			"html body table tbody tr td table tbody tr td table tbody tr td table tbody tr td.px12c table tbody tr td table tbody tr td span font h1";

	/** Second-choice headline selector: layout that wraps the inner table in a {@code div}. */
	private static final String TITLE_SELECTOR_DIV =
			"html body table tbody tr td table tbody tr td table tbody tr td div table tbody tr td table tbody tr td span font h1";

	/**
	 * Tells whether {@code url} addresses a single article page.
	 *
	 * <p>Examples of accepted URLs:
	 * <ul>
	 * <li>http://www.gxrb.com.cn/html/2012-12/04/content_761728.htm</li>
	 * <li>http://ngzb.gxnews.com.cn/html/2012-09/06/content_727915.htm</li>
	 * <li>http://epaper.gxnews.com.cn/ngjb/html/2012-12/04/content_2118673.htm</li>
	 * </ul>
	 *
	 * @param url candidate URL; {@code null} is treated as "not a detail page"
	 * @return {@code true} if the whole URL matches the article-page pattern
	 */
	public boolean isDetailPage(String url) {
		return url != null && DETAIL_PAGE.matcher(url).matches();
	}

	/**
	 * Builds a {@link NewsMeta} from a fetched article page.
	 *
	 * <p>The URL is copied through unchanged, the title comes from the page's
	 * {@code h1}, the date is derived from the URL (with every '/' replaced by
	 * '-' before it is handed to {@code DateUtils.matchDate}), and the content is
	 * taken from the {@code div#ozoom} element via
	 * {@code HtmlCleaner.getContentHtml}. Title and content are passed through
	 * {@code StringUtils.trimSpace} before being stored.
	 *
	 * @param urlMeta fetched page; its URL and HTML are read and are expected to
	 *                be non-null (no null handling is performed here)
	 * @return a newly created {@link NewsMeta}; the title passed to
	 *         {@code trimSpace} is {@code null} when the headline was rejected
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		NewsMeta newspaper = new NewsMeta();

		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();
		newspaper.setUrl(url);

		Document doc = Jsoup.parse(html);

		String title = extractTitle(doc);
		Long date = DateUtils.matchDate(url.replace("/", "-"));
		String content = HtmlCleaner.getContentHtml(url, doc.select("div#ozoom"));

		newspaper.setTitle(StringUtils.trimSpace(title));
		newspaper.setContent(StringUtils.trimSpace(content));
		newspaper.setDate(date);
		return newspaper;
	}

	/**
	 * Picks the headline text, trying the two layout-specific selectors first
	 * and falling back to every {@code h1} in the document.
	 *
	 * @return the headline text, or {@code null} when it is one of the rejected
	 *         values
	 */
	private static String extractTitle(Document doc) {
		String title = doc.select(TITLE_SELECTOR_PX12C).text();
		if (StringUtils.isBlank(title)) {
			title = doc.select(TITLE_SELECTOR_DIV).text();
		}
		if (StringUtils.isBlank(title)) {
			title = doc.select("h1").text();
		}

		// NOTE(review): presumably an advertiser banner picked up by the h1
		// fallback rather than a real headline -- confirm against sample pages.
		if (StringUtils.equalsAny(title, "PICC中国人保财险")) {
			return null;
		}
		// NOTE(review): a title ending in "责任编辑" ("editor in charge") looks
		// like a byline captured instead of a headline -- confirm.
		if (title != null && title.endsWith("责任编辑")) {
			return null;
		}
		return title;
	}

	/**
	 * Tells whether {@code url} addresses a section listing page.
	 *
	 * <p>Examples of accepted URLs:
	 * <ul>
	 * <li>http://www.gxrb.com.cn/html/2012-12/04/node_6.htm</li>
	 * <li>http://gxrb.gxnews.com.cn/html/2012-12/03/node_6.htm</li>
	 * <li>http://epaper.gxnews.com.cn/ngjb/html/2012-12/04/node_315.htm</li>
	 * </ul>
	 *
	 * @param url candidate URL; {@code null} is treated as "not a task page"
	 * @return {@code true} if the whole URL matches the listing-page pattern
	 */
	public boolean isTaskPage(String url) {
		return url != null && TASK_PAGE.matcher(url).matches();
	}

}
